Import Libraries

import pandas as pd
import json
import tensorflow as tf
import os
import re
import shutil
import numpy as np
import time
import matplotlib.pyplot as plt
import collections
import random
import requests
from math import sqrt
from PIL import Image
from tqdm.auto import tqdm
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
import keras.backend as K
keras = tf.keras
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.layers import Input, Dense, Embedding, RepeatVector, Dropout, Flatten, Concatenate, LSTM
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

from keras.models import Model
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from rouge import Rouge
from torchmetrics.text.rouge import ROUGEScore
from pprint import pprint
from rouge_score import rouge_scorer

Create Dataframe with image paths and captions

  • A loop is used to iterate through each item in the loaded data.
  • For each item, the image ID is extracted, and the corresponding image name is formatted using leading zeros.
  • The full image path is created by concatenating the image_path_prefix and the formatted image name.
  • The caption associated with the image is also extracted.
# Root directory holding the COCO-style image files
image_path_prefix = "../data/Images"

# Read the annotation records from the JSON file
with open('../data/selected_data_10000.json', 'r') as json_file:
    data = json.load(json_file)

# Build parallel lists in one pass: zero-padded 12-digit file name per image_id,
# plus the matching caption text
image_paths = [f"{image_path_prefix}/{item['image_id']:012d}.jpg" for item in data]
captions = [item["caption"] for item in data]

# One row per (image, caption) pair
df = pd.DataFrame({'image': image_paths, 'caption': captions})
df.head(5)
image caption
0 ../data/Images/000000568872.jpg A man wearing earphones doing a trick on a ska...
1 ../data/Images/000000304252.jpg There is a surfer wearing a body suit riding a...
2 ../data/Images/000000539124.jpg The teenagers are standing together on the sid...
3 ../data/Images/000000225299.jpg A person jumping off a ramp with their snowboa...
4 ../data/Images/000000490875.jpg Two tennis players play a game while a crowd w...

Data Cleaning

  • The cleaning function is designed to preprocess and clean text data, making it more suitable for natural language processing tasks
def cleaning(text):
    """Normalize a raw caption string for tokenization.

    Lowercases the text, strips punctuation/special characters and
    underscores, collapses whitespace, and wraps the result in the
    [start] / [end] markers expected by the decoder.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation (keeps word characters, incl. underscores, and whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse runs of whitespace into single spaces
    # (raw string fixes the invalid '\s' escape warning on modern Python)
    text = re.sub(r'\s+', ' ', text)
    # Remove any remaining non-alphanumeric characters (e.g. underscores kept by \w)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Remove leading and trailing whitespaces
    text = text.strip()
    # Add start and end tokens
    text = '[start] ' + text + ' [end]'
    return text
# Apply data cleaning to the captions column
df['caption'] = df['caption'].apply(cleaning)
# Keep only captions with at most max_cap_len space-separated tokens
# (including the [start]/[end] markers added by cleaning)
max_cap_len=15
df = df[df['caption'].apply(lambda x: len(x.split(" ")) <= max_cap_len)]
df.head()
image caption
0 ../data/Images/000000568872.jpg [start] a man wearing earphones doing a trick ...
1 ../data/Images/000000304252.jpg [start] there is a surfer wearing a body suit ...
2 ../data/Images/000000539124.jpg [start] the teenagers are standing together on...
3 ../data/Images/000000225299.jpg [start] a person jumping off a ramp with their...
4 ../data/Images/000000490875.jpg [start] two tennis players play a game while a...
print(len(df))
9035

Display some images with captions

  • Shows some image examples
# Show the first three images alongside their cleaned captions
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 6))

for ax, image_path, caption in zip(axes.flatten(), df['image'][:3], df['caption'][:3]):
    picture = mpimg.imread(image_path)
    ax.imshow(picture)
    ax.set_title(caption)
    ax.axis('off')

plt.tight_layout()
plt.show()

Split the data

  • train_test_split is a function from sklearn.model_selection that is used to split the dataset into training and temporary sets initially, and then further split the temporary set into validation and test sets.
  • In the first split, test_size=0.2 holds out 20% of the data as a temporary set (the remaining 80% becomes the training set); in the second split, test_size=0.5 assigns half of that temporary set to validation and half to test.
# First split holds out 20% as a temporary set; second split divides
# that temporary set evenly into validation and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Re-index every subset from zero
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

# Report the size of each subset
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")
Training set size: 7228
Validation set size: 903
Test set size: 904

Setting parameters

# --- Training hyperparameters ---
# Input image side length; 299 is the native InceptionV3 input size
img_dimension = 299 
# Maximum Number of Words in Tokenizer Vocabulary
num_words = 10000 
# Size of the Image Encoding 
encoding_size = 256 
# Size of the LSTM (Long Short-Term Memory) units in the decoder
LSTM_size = 768 
# Batch Size for Training
batch_size = 64
# Number of Training Epochs
n_epochs = 15
# Buffer Size for Shuffling Training Data 
# NOTE(review): name is not snake_case, but it is referenced later, so keep it
Buffer_size = 1000
# Number of Training Examples to Use (None = use the whole dataset)
num_examples = None  

Tokenizing and padding captions

  1. Tokenizer:
  • The Tokenizer class is used to tokenize the text data, converting words into numerical tokens.
  • The fit_on_texts method is applied to the training captions to build the vocabulary based on word frequency.
  1. Convert captions to sequences of integers:
  • The texts_to_sequences method is used to convert each caption in the training set into a sequence of integers based on the tokenized vocabulary.
  1. Pad sequences:
  • The pad_sequences function is used to ensure that all sequences have the same length by padding or truncating as needed. Padding is applied at the end of the sequences (‘post’), and the maximum length is set to max_cap_len + 2.
  1. Determine the number of unique words:
  • The number of unique words in the vocabulary is determined based on the total number of unique tokens in the tokenizer’s word index.
# Create and fit the Tokenizer on the *training* captions only
tokenizer = Tokenizer(num_words = num_words, filters ="!?,'()_-+=&*$#@.",oov_token="")
tokenizer.fit_on_texts(train_df["caption"])
# Convert captions to sequences of integers
captions_train = tokenizer.texts_to_sequences(train_df["caption"])
# Pad sequences to a fixed length (caption words plus [start]/[end] markers)
captions_train = pad_sequences(captions_train, padding='post', maxlen=max_cap_len + 2)

# Determine the number of unique words in the vocabulary
words = len(tokenizer.word_index) if num_words is None else num_words
print("Unique words in vocabulary:", words)

# Print a few examples of original captions and their padded sequences
num_examples_to_print = 2
for i in range(num_examples_to_print):
    # np.random.randint's high bound is exclusive, so use len(...) directly;
    # the original high=len(...) - 1 could never sample the last caption
    random_example_index = np.random.randint(low=0, high=len(captions_train), size=1)[0]
    original_caption = train_df["caption"].iloc[random_example_index]
    padded_sequence = captions_train[random_example_index]

    print(f"Original Caption: {original_caption}")
    print(f"Padded Sequence: {padded_sequence}")
    print("\n")
     
Unique words in vocabulary: 10000
Original Caption: [start] a giraffe having its nose rubbed by a tourist [end]
Padded Sequence: [   3    2  275  272  287  988 2406   51    2  876    4    0    0    0
    0    0    0]


Original Caption: [start] people look at books sitting on a table [end]
Padded Sequence: [   3   13  365   16 1006   20    5    2   35    4    0    0    0    0
    0    0    0]

# Tokenize and pad the validation and test captions with the tokenizer
# that was fit on the training split
captions_valid = pad_sequences(
    tokenizer.texts_to_sequences(val_df["caption"]), padding='post', maxlen=max_cap_len + 2
)
captions_test = pad_sequences(
    tokenizer.texts_to_sequences(test_df["caption"]), padding='post', maxlen=max_cap_len + 2
)

Preprocessing

  1. The load_img function is designed to read and preprocess images for a machine learning model. It takes a file path as input, reads the image using TensorFlow’s tf.io.read_file, and decodes the JPEG content with tf.image.decode_jpeg. The function then resizes the image to a specified dimension (assumed to be square) using tf.image.resize. Finally, the processed image is returned.
# load images
def load_img(path):
  """Read a JPEG file and resize it to the square model input size."""
  raw = tf.io.read_file(path)
  decoded = tf.image.decode_jpeg(raw, channels=3)
  # resize to the img_dimension x img_dimension input resolution
  return tf.image.resize(decoded, (img_dimension, img_dimension))
  1. The preprocess_func_val function prepares data for validation in a sequence-to-sequence model. It loads an image specified by path_index from the validation dataset, processes the input and target captions by removing the last and first elements, respectively. Additionally, it initializes the hidden and cell states with zeros. The function returns a tuple containing the image, initial states, and the input caption, along with the target caption for validation.
  • train data
def preprocess_func(path_index, caption):
  """Map a (train index, padded caption) pair to model inputs and target."""
  # look up this example's image path and load the image
  idx = tf.reshape(path_index, ())
  img = load_img(tf.gather(train_df["image"], indices=idx))
  # teacher forcing: feed the caption shifted left, predict it shifted right
  teacher_caption = caption[:-1]
  tar_caption = caption[1:]
  # zero vector serves as both the initial hidden and cell state
  h_and_c_init = tf.zeros((LSTM_size))

  return (img, h_and_c_init, teacher_caption), tar_caption
  • validation data
def preprocess_func_val(path_index, caption):
  """Map a (validation index, padded caption) pair to model inputs and target."""
  # look up this example's image path and load the image
  idx = tf.reshape(path_index, ())
  img = load_img(tf.gather(val_df["image"], indices=idx))
  # teacher forcing: feed the caption shifted left, predict it shifted right
  teacher_caption = caption[:-1]
  tar_caption = caption[1:]
  # zero vector serves as both the initial hidden and cell state
  h_and_c_init = tf.zeros((LSTM_size))

  return (img, h_and_c_init, teacher_caption), tar_caption
  • test data
def preprocess_func_test(path_index, caption):
  """Map a (test index, padded caption) pair to model inputs and target."""
  # look up this example's image path and load the image
  idx = tf.reshape(path_index, ())
  img = load_img(tf.gather(test_df["image"], indices=idx))
  # teacher forcing: feed the caption shifted left, predict it shifted right
  teacher_caption = caption[:-1]
  tar_caption = caption[1:]
  # zero vector serves as both the initial hidden and cell state
  h_and_c_init = tf.zeros((LSTM_size))

  return (img, h_and_c_init, teacher_caption), tar_caption
  1. creates two TensorFlow datasets
  • path_index_vec_train is created as an array indexing each image path in the training set.
  • Datasets (dataset1_train and dataset2_train) are created from the index array and the corresponding preprocessed captions.
  • tf.data.Dataset.zip combines these datasets into a single dataset.
  • The preprocess_func is applied to each element in parallel, preparing input-output pairs for the model.
  • The dataset is shuffled, batched, and prefetching is applied to enhance training performance.
# Index arrays let each tf.data pipeline look up image paths lazily inside map()
# train
path_index_vec_train = np.arange(len(train_df["image"])).reshape(-1, 1)
dataset1_train = tf.data.Dataset.from_tensor_slices(path_index_vec_train)
dataset2_train = tf.data.Dataset.from_tensor_slices(captions_train)
dataset = tf.data.Dataset.zip((dataset1_train, dataset2_train))
dataset = dataset.map(preprocess_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# val
path_index_vec_val = np.arange(len(val_df["image"])).reshape(-1, 1)
dataset1_val = tf.data.Dataset.from_tensor_slices(path_index_vec_val)
dataset2_val = tf.data.Dataset.from_tensor_slices(captions_valid)
dataset_val = tf.data.Dataset.zip((dataset1_val, dataset2_val))
dataset_val = dataset_val.map(preprocess_func_val, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# test
path_index_vec_test = np.arange(len(test_df["image"])).reshape(-1, 1)
dataset1_test = tf.data.Dataset.from_tensor_slices(path_index_vec_test)
dataset2_test = tf.data.Dataset.from_tensor_slices(captions_test)
dataset_test = tf.data.Dataset.zip((dataset1_test, dataset2_test))
dataset_test = dataset_test.map(preprocess_func_test, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# shuffle/batch/prefetch the training and validation pipelines
# (dataset_test is left unbatched here; it is batched at evaluation time)
dataset = dataset.shuffle(Buffer_size).batch(batch_size).prefetch(1)
dataset_val = dataset_val.shuffle(Buffer_size).batch(256).prefetch(1)

Building the model

  • This code defines a neural network model for image captioning using the InceptionV3 architecture as a feature extractor. The InceptionV3 model is loaded with pre-trained weights, and its layers are frozen to prevent further training. The input layers include tensors for the image, teacher forcing input (caption sequence), and initial hidden state for the LSTM layer. The image input is preprocessed, encoded using a Dense layer, and flattened. The encoding is then repeated for each time step in the caption sequence. The teacher forcing input is embedded and subjected to dropout. The flattened encoding and embedded input are concatenated and fed into an LSTM layer for decoding the sequence. The LSTM output is passed through a Dense layer to predict word probabilities.
# Load InceptionV3 with pre-trained weights (include_top=False drops the
# classification head) and freeze it so only the caption decoder is trained
incep_model = InceptionV3(input_shape=(img_dimension,img_dimension,3),
                                                    include_top=False)
incep_model.trainable=False
# Input layers for the model
img = Input(shape=(img_dimension, img_dimension, 3))  # Image input tensor
teacher_forcing = Input(shape=(max_cap_len + 1))  # Teacher-forcing caption tokens (caption minus its last token)
initial_state = Input(shape=(LSTM_size))  # Initial state vector, used for both LSTM hidden and cell state

# Preprocess the image input using the InceptionV3-specific scaling function
img_preprocessed = preprocess_input(img)

# Encode the preprocessed image using the frozen InceptionV3 backbone
encoding = incep_model(img_preprocessed)
encoding_layer = Dense(encoding_size, activation='relu')  # Project each spatial feature to encoding_size
encoding = encoding_layer(encoding)
encoding = Flatten()(encoding)  # Flatten the spatial grid of features into one vector

# Repeat the flattened encoding once per decoder timestep so the image
# features are available at every step of the caption
encoding = RepeatVector(max_cap_len + 1)(encoding)

# Embedding layer for the teacher forcing input and apply dropout
# (input dim words+1 leaves room for the padding index 0; mask_zero
# propagates the padding mask downstream)
embedding_layer = Embedding(words+1, 256, mask_zero=True)
dropout_on_embbeds = Dropout(0.15)

embeddings = embedding_layer(teacher_forcing)
embeddings = dropout_on_embbeds(embeddings)

# Concatenate the repeated image encoding with the embedded caption tokens
concatenated_input = Concatenate()([encoding, embeddings])

# LSTM layer for decoding the sequence
decoder_lstm_layer = LSTM(LSTM_size, return_sequences=True, return_state=True, dropout=0.2)
h = initial_state
c = initial_state

# Apply the LSTM layer to the concatenated input; the same zero vector is
# fed as both initial hidden and cell state
lstm_output, h, c = decoder_lstm_layer(concatenated_input, initial_state=[h, c])

# Output layer emits raw logits (activation=None) — the loss applies
# softmax itself via softmax_cross_entropy_with_logits
decoder_dense_layer = Dense(words + 1, activation=None, name='output_layer')
output_probabilities = decoder_dense_layer(lstm_output)

# Build the model; note the input order: [image, initial_state, teacher_forcing]
model = Model([img, initial_state, teacher_forcing], output_probabilities)
model.summary()
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 input_2 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 tf.math.truediv (TFOpLambda)   (None, 299, 299, 3)  0           ['input_2[0][0]']                
                                                                                                  
 tf.math.subtract (TFOpLambda)  (None, 299, 299, 3)  0           ['tf.math.truediv[0][0]']        
                                                                                                  
 inception_v3 (Functional)      (None, 8, 8, 2048)   21802784    ['tf.math.subtract[0][0]']       
                                                                                                  
 dense (Dense)                  (None, 8, 8, 256)    524544      ['inception_v3[0][0]']           
                                                                                                  
 input_3 (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 flatten (Flatten)              (None, 16384)        0           ['dense[0][0]']                  
                                                                                                  
 embedding (Embedding)          (None, 16, 256)      2560256     ['input_3[0][0]']                
                                                                                                  
 repeat_vector (RepeatVector)   (None, 16, 16384)    0           ['flatten[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 16, 256)      0           ['embedding[0][0]']              
                                                                                                  
 concatenate_2 (Concatenate)    (None, 16, 16640)    0           ['repeat_vector[0][0]',          
                                                                  'dropout[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 16, 768),    53480448    ['concatenate_2[0][0]',          
                                 (None, 768),                     'input_4[0][0]',                
                                 (None, 768)]                     'input_4[0][0]']                
                                                                                                  
 output_layer (Dense)           (None, 16, 10001)    7690769     ['lstm[0][0]']                   
                                                                                                  
==================================================================================================
Total params: 86,058,801
Trainable params: 64,256,017
Non-trainable params: 21,802,784
__________________________________________________________________________________________________

Loss Function

  • The provided code defines a custom loss function named sparse_it_up for the image captioning model. This loss function is designed to handle sparse categorical cross-entropy with masking, which is suitable for tasks where only specific elements in the sequence need to be considered for loss calculation. The function takes two arguments, y_true and y_preds, representing the true labels and predicted probabilities, respectively. It first casts the true labels to integers and creates a mask to identify non-zero elements. The true labels are then converted to one-hot encoding. Finally, the function computes the categorical cross-entropy by considering the masked elements and returns the mean loss.
# loss function
def sparse_it_up(y_true, y_preds):
    """Masked sparse categorical cross-entropy over caption timesteps.

    y_true holds integer token ids (0 = padding); y_preds holds raw
    logits of shape (..., words + 1). Padded positions are excluded
    from both numerator and denominator, so the result is the mean
    cross-entropy over real tokens only.
    """
    # Cast true labels to integers
    y_true = tf.cast(y_true, tf.int32)
    # Mask out padded positions (token id 0)
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    mask = tf.cast(mask, dtype=tf.float32)
    # One-hot encode the targets for the dense cross-entropy op
    y_true_one_hot = tf.one_hot(y_true, words + 1)
    # Per-position cross-entropy, zeroed at padded positions
    loss = tf.nn.softmax_cross_entropy_with_logits(y_true_one_hot, y_preds) * mask
    # Average over non-padded tokens only: the original reduce_mean divided
    # by the total number of positions (padding included), deflating the loss
    return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)

Model Compile

# Compile with the custom masked cross-entropy loss and Nadam (lr=1e-3)
model.compile(optimizer=keras.optimizers.legacy.Nadam(0.001), loss = sparse_it_up, metrics=['accuracy'])
# Save the weights whenever validation accuracy improves
Checkpoint = ModelCheckpoint('Image_captioning.hdf5', save_best_only=True, monitor='val_accuracy', verbose=1)
# Stop once val_accuracy has not improved for 5 consecutive epochs
early_stop = EarlyStopping(monitor='val_accuracy', patience=5)
# Train; use the declared n_epochs hyperparameter instead of the
# hard-coded literal 20, which silently ignored it
history = model.fit(dataset, epochs=n_epochs, validation_data=dataset_val, callbacks=[early_stop, Checkpoint])
Epoch 1/20
2023-11-29 23:53:29.945489: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [7228,17]
     [[{{node Placeholder/_2}}]]
2023-11-29 23:53:29.945766: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7228,1]
     [[{{node Placeholder/_0}}]]
2023-11-29 23:53:33.610559: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
113/113 [==============================] - ETA: 0s - loss: 3.6942 - accuracy: 0.1578 
2023-11-30 00:16:02.849924: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [903,17]
     [[{{node Placeholder/_2}}]]
2023-11-30 00:16:02.850420: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [903,1]
     [[{{node Placeholder/_0}}]]

Epoch 1: val_accuracy improved from -inf to 0.24426, saving model to Image_captioning.hdf5
113/113 [==============================] - 1471s 13s/step - loss: 3.6942 - accuracy: 0.1578 - val_loss: 3.2062 - val_accuracy: 0.2443
Epoch 2/20
113/113 [==============================] - ETA: 0s - loss: 2.8184 - accuracy: 0.2809 
Epoch 2: val_accuracy improved from 0.24426 to 0.31200, saving model to Image_captioning.hdf5
113/113 [==============================] - 1486s 13s/step - loss: 2.8184 - accuracy: 0.2809 - val_loss: 2.6304 - val_accuracy: 0.3120
Epoch 3/20
113/113 [==============================] - ETA: 0s - loss: 2.3815 - accuracy: 0.3284 
Epoch 3: val_accuracy improved from 0.31200 to 0.33924, saving model to Image_captioning.hdf5
113/113 [==============================] - 1483s 13s/step - loss: 2.3815 - accuracy: 0.3284 - val_loss: 2.4592 - val_accuracy: 0.3392
Epoch 4/20
113/113 [==============================] - ETA: 0s - loss: 2.1069 - accuracy: 0.3610 
Epoch 4: val_accuracy improved from 0.33924 to 0.34702, saving model to Image_captioning.hdf5
113/113 [==============================] - 1495s 13s/step - loss: 2.1069 - accuracy: 0.3610 - val_loss: 2.4011 - val_accuracy: 0.3470
Epoch 5/20
113/113 [==============================] - ETA: 0s - loss: 1.8315 - accuracy: 0.4046 
Epoch 5: val_accuracy improved from 0.34702 to 0.34894, saving model to Image_captioning.hdf5
113/113 [==============================] - 1499s 13s/step - loss: 1.8315 - accuracy: 0.4046 - val_loss: 2.3993 - val_accuracy: 0.3489
Epoch 6/20
113/113 [==============================] - ETA: 0s - loss: 1.5184 - accuracy: 0.4671 
Epoch 6: val_accuracy improved from 0.34894 to 0.34930, saving model to Image_captioning.hdf5
113/113 [==============================] - 1491s 13s/step - loss: 1.5184 - accuracy: 0.4671 - val_loss: 2.4112 - val_accuracy: 0.3493
Epoch 7/20
113/113 [==============================] - ETA: 0s - loss: 1.1674 - accuracy: 0.5583 
Epoch 7: val_accuracy did not improve from 0.34930
113/113 [==============================] - 1502s 13s/step - loss: 1.1674 - accuracy: 0.5583 - val_loss: 2.4897 - val_accuracy: 0.3433
Epoch 8/20
113/113 [==============================] - ETA: 0s - loss: 0.8197 - accuracy: 0.6705 
Epoch 8: val_accuracy did not improve from 0.34930
113/113 [==============================] - 1427s 13s/step - loss: 0.8197 - accuracy: 0.6705 - val_loss: 2.5623 - val_accuracy: 0.3401
Epoch 9/20
113/113 [==============================] - ETA: 0s - loss: 0.5306 - accuracy: 0.7709 
Epoch 9: val_accuracy did not improve from 0.34930
113/113 [==============================] - 1427s 13s/step - loss: 0.5306 - accuracy: 0.7709 - val_loss: 2.6495 - val_accuracy: 0.3345
Epoch 10/20
113/113 [==============================] - ETA: 0s - loss: 0.3341 - accuracy: 0.8356  
Epoch 10: val_accuracy did not improve from 0.34930
113/113 [==============================] - 10346s 92s/step - loss: 0.3341 - accuracy: 0.8356 - val_loss: 2.7407 - val_accuracy: 0.3398
Epoch 11/20
113/113 [==============================] - ETA: 0s - loss: 0.2142 - accuracy: 0.8712 
Epoch 11: val_accuracy did not improve from 0.34930
113/113 [==============================] - 6295s 56s/step - loss: 0.2142 - accuracy: 0.8712 - val_loss: 2.8099 - val_accuracy: 0.3378
# Restore the best-val_accuracy weights saved by the ModelCheckpoint callback
# (the in-memory model holds the last epoch's weights, not the best)
model.load_weights("Image_captioning.hdf5")

Loss and Accuracy Plots

# Plot training-vs-validation loss, then accuracy, as two separate figures
epochs = range(1, len(history.history['accuracy']) + 1)
training_loss = history.history['loss']
validation_loss = history.history['val_loss']
training_accuracy = history.history['accuracy']
validation_accuracy = history.history['val_accuracy']

for train_curve, val_curve, label in [
    (training_loss, validation_loss, 'Loss'),
    (training_accuracy, validation_accuracy, 'Accuracy'),
]:
    plt.plot(epochs, train_curve, 'o-', label=f'Training {label}', color='blue')
    plt.plot(epochs, val_curve, 'o-', label=f'Validation {label}', color='orange')
    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.title(f'Training and Validation {label}')
    plt.legend()
    plt.show()

Evaluating the model on test dataset

# Report masked loss and accuracy on the batched test pipeline
model.evaluate(dataset_test.batch(256).prefetch(1))
2023-11-30 08:12:13.967593: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [904,17]
     [[{{node Placeholder/_2}}]]
4/4 [==============================] - 103s 25s/step - loss: 2.4127 - accuracy: 0.3423
[2.4126815795898438, 0.34227755665779114]
# Raw per-timestep logits for every test example
predictions = model.predict(dataset_test.batch(256).prefetch(1))
2023-11-30 14:59:10.956424: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [904,17]
     [[{{node Placeholder/_2}}]]
4/4 [==============================] - 163s 39s/step
# Inspect the logit matrix (timesteps x vocabulary) for the second test example
predictions[1]
array([[-11.754684  , -11.630219  ,   8.127428  , ..., -11.698658  ,
        -11.739982  , -11.715563  ],
       [ -9.396809  ,  -9.369763  ,   0.45839906, ...,  -9.284527  ,
         -9.443345  ,  -9.198584  ],
       [-12.368148  , -12.627315  ,   2.681948  , ..., -12.519609  ,
        -12.549717  , -12.664832  ],
       ...,
       [-11.306902  , -11.330777  ,   1.9682028 , ..., -11.280667  ,
        -11.753964  , -11.522484  ],
       [-11.306902  , -11.330777  ,   1.9682028 , ..., -11.280667  ,
        -11.753964  , -11.522484  ],
       [-11.306902  , -11.330777  ,   1.9682028 , ..., -11.280667  ,
        -11.753964  , -11.522484  ]], dtype=float32)

Recall the model

# Load the saved model; custom_object_scope maps the serialized loss name
# back to the sparse_it_up function so deserialization can resolve it
from keras.models import load_model
from keras.utils import custom_object_scope

with custom_object_scope({'sparse_it_up': sparse_it_up}):
    loaded_model = load_model('Image_captioning.hdf5')

Make Predicted Captions on Test Set

# Make predictions
predictions = loaded_model.predict(dataset_test.batch(256).prefetch(1))
# Greedy decoding: take the highest-logit token at every timestep
predicted_indices = np.argmax(predictions, axis=-1)
# Map token ids back to words; id 0 (padding) has no entry in index_word,
# so fall back to "" instead of raising KeyError
predicted_words = [tokenizer.index_word.get(index, "") for index in predicted_indices.flatten()]
predicted_words = np.array(predicted_words).reshape(predicted_indices.shape)

generated_captions_list = []

# Assemble one caption string per test example
for i in range(len(predicted_words)):
    # Truncate at the first [end] token the model produced, if any
    end_token_position = np.where(predicted_words[i] == "[end]")[0]

    if len(end_token_position) > 0:
        generated_caption = ' '.join(predicted_words[i][:end_token_position[0] + 1])
    else:
        # If [end] is not found, append it so every caption is well-formed
        generated_caption = ' '.join(predicted_words[i]) + " [end]"

    # Add [start] at the beginning of each caption
    generated_caption = "[start] " + generated_caption
    generated_captions_list.append(generated_caption)

print(generated_captions_list[:5])
2023-11-30 17:29:35.140461: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [904,17]
     [[{{node Placeholder/_2}}]]
4/4 [==============================] - 114s 27s/step
['[start] a are a baseball on a on a [end]', '[start] a man of people are a a game cake [end]', '[start] a man is sitting on a to on [end]', '[start] a child child is holding on a bear [end]', '[start] small child holding a child bear with to a teddy playing playing playing playing playing playing [end]']
# Display an image with its reference and generated captions
def caption_image2(path, true_caption, generated_captions_list, index):
    """Print the true and predicted captions for one example and show its image."""
    img_tensor = load_img(path)
    predicted_caption = generated_captions_list[index]

    print(f"Label: {true_caption}")
    print(f"Predicted: {predicted_caption}")

    plt.figure(figsize=(5, 5))
    # load_img returns 0-255 floats; rescale to [0, 1] for imshow
    plt.imshow(img_tensor / 255.0)
    plt.axis("off")
    plt.show()

Some Outputs

# Example usage: render a sample of test images with their reference ("Label")
# and model-generated ("Predicted") captions. The Label/Predicted lines below
# each call are captured notebook output, not code.
caption_image2(test_df["image"][100], test_df["caption"][100], generated_captions_list, 100)
Label: [start] a man on a snowboard who is performing a jump [end]
Predicted: [start] a person riding a snowboard with is skiing a snowboard [end]

# Example usage:
caption_image2(test_df["image"][200], test_df["caption"][200], generated_captions_list, 200)
Label: [start] a woman swinging a tennis racquet on a tennis court [end]
Predicted: [start] a man is a tennis ball on a tennis court [end]

# Example usage:
caption_image2(test_df["image"][300], test_df["caption"][300], generated_captions_list, 300)
Label: [start] two women are standing in front of a grocery store refrigerator [end]
Predicted: [start] two women standing standing in a of a market market [end]

# Example usage:
caption_image2(test_df["image"][370], test_df["caption"][370], generated_captions_list, 370)
Label: [start] a couple of men standing next to a street sign [end]
Predicted: [start] a street of a walking on to a street sign [end]

caption_image2(test_df["image"][390], test_df["caption"][390], generated_captions_list, 390)
Label: [start] a city bus traveling on a road near a street sign [end]
Predicted: [start] a tour and is down a street next a bus [end]

caption_image2(test_df["image"][500], test_df["caption"][500], generated_captions_list, 500)
Label: [start] a man riding a skateboard down a sidewalk [end]
Predicted: [start] a man riding a skateboard while a skateboard [end]

caption_image2(test_df["image"][510], test_df["caption"][510], generated_captions_list, 510)
Label: [start] a park bench sits in front of a pond [end]
Predicted: [start] a person sitting on on a of a park bench [end]

caption_image2(test_df["image"][520], test_df["caption"][520], generated_captions_list, 520)
Label: [start] a kid with a piece of bread in hand standing [end]
Predicted: [start] a little in a baby of a in a [end]

caption_image2(test_df["image"][550], test_df["caption"][550], generated_captions_list, 550)
Label: [start] some people and a woman with a red frisbee [end]
Predicted: [start] a men are a frisbee standing a frisbee field [end]

caption_image2(test_df["image"][570], test_df["caption"][570], generated_captions_list, 570)
Label: [start] a skier skiing and wearing a competitive bib [end]
Predicted: [start] a man and down a the snow covered [end]

caption_image2(test_df["image"][120], test_df["caption"][120], generated_captions_list, 120)
Label: [start] a child laying on a bed in a room [end]
Predicted: [start] a small is on a bed with a bed [end]

caption_image2(test_df["image"][180], test_df["caption"][180], generated_captions_list, 180)
Label: [start] people buying vegetables from an outdoor market vendor [end]
Predicted: [start] a are at at the outdoor market at at [end]

caption_image2(test_df["image"][800], test_df["caption"][800], generated_captions_list, 800)
Label: [start] three different men with racquets on a tennis court [end]
Predicted: [start] a tennis tennis playing a at a tennis court [end]

caption_image2(test_df["image"][590], test_df["caption"][590], generated_captions_list, 590)
Label: [start] man on a tennis court holding tennis racket and ball [end]
Predicted: [start] a on a tennis court on a ball on hit [end]

caption_image2(test_df["image"][230], test_df["caption"][230], generated_captions_list, 230)
Label: [start] a man is holding what seems to be a banana [end]
Predicted: [start] a man holding holding a and for a [end]

caption_image2(test_df["image"][240], test_df["caption"][240], generated_captions_list, 240)
Label: [start] a person falling in the sand after catching a frisbee [end]
Predicted: [start] a group walking a the beach with a the beach [end]

Evaluate Scores

BLEU, METEOR, and ROUGE scores on the test set:

- BLEU (Bilingual Evaluation Understudy) Score: Originally used to evaluate the quality of machine translation, the BLEU score works by comparing the overlap of machine-generated text with a set of reference texts (i.e. correct texts written by humans). It mainly focuses on n-gram matching.
- METEOR (Metric for Evaluation of Translation with Explicit Ordering) Score: Similar to BLEU, but it takes into account the matching of synonyms and sentence structure, so it is generally considered to reflect human evaluation more accurately than BLEU.
- ROUGE (Recall-Oriented Understudy for Gisting Evaluation) Score: Mainly used to evaluate automatic summarization tasks, but also suitable for image description. It calculates the overlap between reference text and generated text, focusing on recall.

The BLEU score indicates the overlap between the generated and true captions, while the METEOR score considers additional factors such as stemming and synonymy.

  • BLEU & METEOR Score
# METEOR relies on WordNet for synonym matching; download is a no-op if cached.
nltk.download('wordnet')
# Ground-truth captions for the test split (strings with [start]/[end] markers).
reference_captions = test_df["caption"]

# Function to calculate BLEU and METEOR scores
def calculate_scores(reference_captions, generated_captions):
    """Compute per-sample BLEU and METEOR scores for paired captions.

    Both arguments are iterables of whitespace-separated caption strings;
    returns two parallel lists of floats (one score per caption pair).
    """
    # method4 smoothing avoids zero BLEU on short captions with no
    # higher-order n-gram overlap.
    smoothing = SmoothingFunction().method4
    bleu_scores, meteor_scores = [], []

    for reference, candidate in zip(reference_captions, generated_captions):
        ref_tokens = reference.split()
        cand_tokens = candidate.split()

        bleu_scores.append(
            sentence_bleu([ref_tokens], cand_tokens,
                          smoothing_function=smoothing))
        meteor_scores.append(meteor_score([ref_tokens], cand_tokens))

    return bleu_scores, meteor_scores

# Calculate per-sample BLEU and METEOR scores over the whole test set.
bleu_scores, meteor_scores = calculate_scores(reference_captions, generated_captions_list)

# # Print the individual BLEU and METEOR scores
# for i, (bleu, meteor) in enumerate(zip(bleu_scores, meteor_scores), start=1):
#     print(f'Sample {i}: BLEU score: {bleu}, METEOR score: {meteor}')

# Summarize with the arithmetic mean of each metric across all test samples.
average_bleu = sum(bleu_scores) / len(bleu_scores)
average_meteor = sum(meteor_scores) / len(meteor_scores)
print(f'Average BLEU score of test set: {average_bleu}')
print(f'Average METEOR score of test set: {average_meteor}')
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xiaodanlu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Average BLEU score of test set: 0.1028387192495446
Average METEOR score of test set: 0.33695010131979236
  • ROUGE Score
# Hypotheses are the generated captions; references are the ground-truth
# captions. FIX: the original did `' '.join(h)` over each caption *string*
# (inserting a space between every character) and took references from
# test_df["image"] -- the image file paths -- so ROUGE was computed on the
# wrong text entirely. Captions are already space-separated strings.
hyp = list(generated_captions_list)
ref = list(test_df["caption"])

# make a RougeScorer object with rouge_types=['rouge1'] (unigram overlap)
scorer = rouge_scorer.RougeScorer(['rouge1'])
results = {'precision': [], 'recall': [], 'fmeasure': []}

for h, r in zip(hyp, ref):
    # FIX: the rouge_score API is score(target, prediction) -- the reference
    # comes first. Passing the hypothesis first swapped precision and recall.
    score = scorer.score(r, h)
    precision, recall, fmeasure = score['rouge1']
    results['precision'].append(precision)
    results['recall'].append(recall)
    results['fmeasure'].append(fmeasure)

# Average each metric over the test set.
average_precision = sum(results['precision']) / len(results['precision'])
average_recall = sum(results['recall']) / len(results['recall'])
average_fmeasure = sum(results['fmeasure']) / len(results['fmeasure'])

print(f'Rouge score on test set: Average Precision: {average_precision}')
print(f'Rouge score on test set: Average Recall: {average_recall}')
print(f'Rouge score on test set: Average F1 Measure: {average_fmeasure}')
Rouge score on test set: Average Precision: 0.39185840707964603
Rouge score on test set: Average Recall: 0.2592120385251268
Rouge score on test set: Average F1 Measure: 0.3086467535872159
  • Plot
# Bar chart comparing the three caption-quality metrics on the test set.
# FIX: dropped the original's no-op self-assignments
# (`average_bleu = average_bleu`, `average_meteor = average_meteor`).
average_Rouge = average_fmeasure  # summarize ROUGE by its F1 measure
# Labels
labels = ['BLEU', 'METEOR', 'ROUGE']
# Scores (all three metrics are bounded to [0, 1])
scores = [average_bleu, average_meteor, average_Rouge]
# Plotting
plt.bar(labels, scores, color=['blue', 'orange', 'red'])
plt.ylim(0, 1)
# Annotate each bar with its numeric value just above the bar top.
for i, score in enumerate(scores):
    plt.text(i, score + 0.01, f'{score:.4f}', ha='center')
plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Average BLEU, METEOR, and ROUGE Scores on Test Set')
plt.show()